{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "227e221d-cb4c-4b52-9c4f-2bcff51b00a5",
   "metadata": {},
   "source": [
    "# This exercise uses Selenium to render websites, read their page source, and pass that source to OpenAI. The model is then asked to identify potential vulnerabilities and security gaps in that source."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "dddabc12-ce06-45c1-875c-ab7e32b94e10",
   "metadata": {},
   "outputs": [],
   "source": [
    "# imports\n",
    "\n",
    "import os\n",
    "import requests\n",
    "from dotenv import load_dotenv\n",
    "from bs4 import BeautifulSoup\n",
    "from IPython.display import Markdown, display\n",
    "from openai import OpenAI\n",
    "\n",
    "# If you get an error running this cell, then please head over to the troubleshooting notebook!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "ef28b0bd-f11f-4b2a-88b4-112f932c9132",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "API key found and looks good so far!\n"
     ]
    }
   ],
   "source": [
    "# Load environment variables in a file called .env\n",
    "\n",
    "load_dotenv(override=True)\n",
    "api_key = os.getenv('OPENAI_API_KEY')\n",
    "\n",
    "# Check the key.\n",
    "# The whitespace test runs before the prefix test: a key with leading spaces or tabs\n",
    "# also fails startswith(\"sk-proj-\"), and would otherwise get the misleading prefix message.\n",
    "\n",
    "if not api_key:\n",
    "    print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n",
    "elif api_key.strip() != api_key:\n",
    "    print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n",
    "elif not api_key.startswith(\"sk-proj-\"):\n",
    "    print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook\")\n",
    "else:\n",
    "    print(\"API key found and looks good so far!\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "d8f1af3b-c748-41f0-95f3-e21f512e7539",
   "metadata": {},
   "outputs": [],
   "source": [
    "openai = OpenAI()\n",
    "\n",
    "# If this doesn't work, try Kernel menu >> Restart Kernel and Clear Outputs Of All Cells, then run the cells from the top of this notebook down.\n",
    "# If it STILL doesn't work (horrors!) then please see the Troubleshooting notebook in this folder for full instructions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "a80c8acf-8f8b-43ed-9473-698d33e74ed2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: selenium in /root/anaconda3/envs/llms/lib/python3.11/site-packages (4.32.0)\n",
      "Requirement already satisfied: urllib3<3,>=1.26 in /root/anaconda3/envs/llms/lib/python3.11/site-packages (from urllib3[socks]<3,>=1.26->selenium) (2.4.0)\n",
      "Requirement already satisfied: trio~=0.17 in /root/anaconda3/envs/llms/lib/python3.11/site-packages (from selenium) (0.30.0)\n",
      "Requirement already satisfied: trio-websocket~=0.9 in /root/anaconda3/envs/llms/lib/python3.11/site-packages (from selenium) (0.12.2)\n",
      "Requirement already satisfied: certifi>=2021.10.8 in /root/anaconda3/envs/llms/lib/python3.11/site-packages (from selenium) (2025.1.31)\n",
      "Requirement already satisfied: typing_extensions~=4.9 in /root/anaconda3/envs/llms/lib/python3.11/site-packages (from selenium) (4.13.2)\n",
      "Requirement already satisfied: websocket-client~=1.8 in /root/anaconda3/envs/llms/lib/python3.11/site-packages (from selenium) (1.8.0)\n",
      "Requirement already satisfied: attrs>=23.2.0 in /root/anaconda3/envs/llms/lib/python3.11/site-packages (from trio~=0.17->selenium) (25.3.0)\n",
      "Requirement already satisfied: sortedcontainers in /root/anaconda3/envs/llms/lib/python3.11/site-packages (from trio~=0.17->selenium) (2.4.0)\n",
      "Requirement already satisfied: idna in /root/anaconda3/envs/llms/lib/python3.11/site-packages (from trio~=0.17->selenium) (3.10)\n",
      "Requirement already satisfied: outcome in /root/anaconda3/envs/llms/lib/python3.11/site-packages (from trio~=0.17->selenium) (1.3.0.post0)\n",
      "Requirement already satisfied: sniffio>=1.3.0 in /root/anaconda3/envs/llms/lib/python3.11/site-packages (from trio~=0.17->selenium) (1.3.1)\n",
      "Requirement already satisfied: wsproto>=0.14 in /root/anaconda3/envs/llms/lib/python3.11/site-packages (from trio-websocket~=0.9->selenium) (1.2.0)\n",
      "Requirement already satisfied: pysocks!=1.5.7,<2.0,>=1.5.6 in /root/anaconda3/envs/llms/lib/python3.11/site-packages (from urllib3[socks]<3,>=1.26->selenium) (1.7.1)\n",
      "Requirement already satisfied: h11<1,>=0.9.0 in /root/anaconda3/envs/llms/lib/python3.11/site-packages (from wsproto>=0.14->trio-websocket~=0.9->selenium) (0.14.0)\n",
      "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.\u001b[0m\u001b[33m\n",
      "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n"
     ]
    }
   ],
   "source": [
    "%pip install selenium"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "068b4938-3020-4406-a305-500bcf46f7f9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: webdriver-manager in /root/anaconda3/envs/llms/lib/python3.11/site-packages (4.0.2)\n",
      "Requirement already satisfied: requests in /root/anaconda3/envs/llms/lib/python3.11/site-packages (from webdriver-manager) (2.32.3)\n",
      "Requirement already satisfied: python-dotenv in /root/anaconda3/envs/llms/lib/python3.11/site-packages (from webdriver-manager) (1.1.0)\n",
      "Requirement already satisfied: packaging in /root/anaconda3/envs/llms/lib/python3.11/site-packages (from webdriver-manager) (24.2)\n",
      "Requirement already satisfied: charset_normalizer<4,>=2 in /root/anaconda3/envs/llms/lib/python3.11/site-packages (from requests->webdriver-manager) (3.4.1)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /root/anaconda3/envs/llms/lib/python3.11/site-packages (from requests->webdriver-manager) (3.10)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in /root/anaconda3/envs/llms/lib/python3.11/site-packages (from requests->webdriver-manager) (2.4.0)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /root/anaconda3/envs/llms/lib/python3.11/site-packages (from requests->webdriver-manager) (2025.1.31)\n",
      "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.\u001b[0m\u001b[33m\n",
      "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n"
     ]
    }
   ],
   "source": [
    "%pip install webdriver-manager"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "417fae16-d2c9-425c-bd27-86996b3a1f7f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2025-05-17 15:27:43--  https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb\n",
      "Resolving dl.google.com (dl.google.com)... 74.125.193.136, 74.125.193.190, 74.125.193.93, ...\n",
      "Connecting to dl.google.com (dl.google.com)|74.125.193.136|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 116499092 (111M) [application/x-debian-package]\n",
      "Saving to: ‘google-chrome-stable_current_amd64.deb.5’\n",
      "\n",
      "google-chrome-stabl 100%[===================>] 111.10M  6.34MB/s    in 21s     \n",
      "\n",
      "2025-05-17 15:28:05 (5.18 MB/s) - ‘google-chrome-stable_current_amd64.deb.5’ saved [116499092/116499092]\n",
      "\n",
      "Reading package lists... Done\n",
      "Building dependency tree... Done\n",
      "Reading state information... Done\n",
      "Note, selecting 'google-chrome-stable' instead of './google-chrome-stable_current_amd64.deb'\n",
      "google-chrome-stable is already the newest version (136.0.7103.59-1).\n",
      "The following packages were automatically installed and are no longer required:\n",
      "  htop libnl-3-200 libnl-genl-3-200\n",
      "Use 'sudo apt autoremove' to remove them.\n",
      "0 upgraded, 0 newly installed, 0 to remove and 7 not upgraded.\n"
     ]
    }
   ],
   "source": [
    "# Step 1: Download the .deb package as a normal user.\n",
    "# -O pins the output filename, so re-running this cell overwrites the previous download\n",
    "# instead of saving a numbered copy (e.g. '.deb.5') that Step 2 would never install.\n",
    "!wget -O google-chrome-stable_current_amd64.deb https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb\n",
    "\n",
    "# Step 2: Install it with sudo.\n",
    "# -y answers apt's confirmation prompt up front, so the cell does not wait for interactive input.\n",
    "!sudo apt install -y ./google-chrome-stable_current_amd64.deb\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "cf4c5bcc-60ae-4f06-8052-f4c4398e0d5c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/usr/bin/google-chrome\n",
      "Google Chrome 136.0.7103.59 \n"
     ]
    }
   ],
   "source": [
    "# Sanity check: print where the Chrome binary is found on PATH and which version it reports\n",
    "!which google-chrome\n",
    "!google-chrome --version"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "959b13d9-374f-4cf8-9bde-f197c39500b0",
   "metadata": {},
   "outputs": [],
   "source": [
    "from selenium import webdriver\n",
    "from selenium.webdriver.chrome.service import Service\n",
    "from selenium.webdriver.chrome.options import Options\n",
    "from webdriver_manager.chrome import ChromeDriverManager\n",
    "\n",
    "# options = Options()\n",
    "# options.binary_location = \"/usr/bin/google-chrome\"  # Or wherever `which google-chrome` points\n",
    "# options.add_argument(\"--headless\")\n",
    "# options.add_argument(\"--no-sandbox\")\n",
    "# options.add_argument(\"--disable-dev-shm-usage\")\n",
    "\n",
    "# service = Service(ChromeDriverManager().install())\n",
    "# driver = webdriver.Chrome(service=service, options=options)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "97227a23-e367-498c-8190-7559b4d08e50",
   "metadata": {},
   "outputs": [],
   "source": [
    "# # Get page source\n",
    "# url = \"https://nohello.net\"\n",
    "# driver.get(url)\n",
    "# page_source = driver.page_source\n",
    "# driver.quit()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2319710e-10a4-4964-acec-276ad43442c0",
   "metadata": {},
   "source": [
    "# Selenium setup done. Defining the Website class and other objects below"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "4683ed7d-6a1e-4d68-b951-27ed6f5d00a4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define our system prompt - you can experiment with this later, changing the last sentence to 'Respond in markdown in Spanish.'\n",
    "\n",
    "system_prompt = \"You are an assistant that analyzes the page source of a website and identifies potentila vulnerabilities and security gaps in the page source code and gives a short one liner on what should be done about it. Respond in markdown\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "f28982e8-dd3c-4a64-8745-a31709a5d737",
   "metadata": {},
   "outputs": [],
   "source": [
    "class Website:\n",
    "    \"\"\"\n",
    "    A web page loaded in headless Chrome through Selenium.\n",
    "\n",
    "    Attributes:\n",
    "        url: the address that was requested\n",
    "        page_title: the title the browser reports after loading the page\n",
    "        page_source: the page source the browser reports after loading the page\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, url, binary_location=\"/usr/bin/google-chrome\"):\n",
    "        \"\"\"\n",
    "        Create this Website object from the given url using the Selenium library\n",
    "\n",
    "        Args:\n",
    "            url: address of the page to load\n",
    "            binary_location: path to the Chrome executable, i.e. wherever `which google-chrome` points\n",
    "        \"\"\"\n",
    "        self.url = url\n",
    "\n",
    "        options = Options()\n",
    "        options.binary_location = binary_location\n",
    "        options.add_argument(\"--headless\")\n",
    "        options.add_argument(\"--no-sandbox\")\n",
    "        options.add_argument(\"--disable-dev-shm-usage\")\n",
    "\n",
    "        service = Service(ChromeDriverManager().install())\n",
    "        driver = webdriver.Chrome(service=service, options=options)\n",
    "        try:\n",
    "            driver.get(url)\n",
    "            self.page_title = driver.title\n",
    "            self.page_source = driver.page_source\n",
    "        finally:\n",
    "            # Always shut the browser down, even when loading the page fails,\n",
    "            # so a failed run does not leave Chrome / chromedriver processes behind.\n",
    "            driver.quit()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "a24a695c-6e86-4efe-83ff-91d24373e171",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's try one out. Change the website and add print statements to follow along.\n",
    "\n",
    "testweb = Website(\"https://nohello.net\")\n",
    "# print(testweb.page_title)\n",
    "# print(testweb.page_source)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "2b582bea-d9fe-4f74-8207-31bdea9b312c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# A function that writes a User Prompt that asks for a security analysis of a website's page source:\n",
    "\n",
    "def user_prompt_for(website):\n",
    "    \"\"\"Build the user prompt: a title line, the analysis request, then the raw page source.\"\"\"\n",
    "    prompt_parts = [\n",
    "        f\"You are looking at a website titled {website.page_title}\",\n",
    "        \"\\nThe contents of this website is as follows; please analyze the page source on this website in detail and identify potential vulnerabilites and security gaps that can be fixed.\\n\\n\",\n",
    "        website.page_source,\n",
    "    ]\n",
    "    return \"\".join(prompt_parts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "a652eb76-3c2d-404b-91fa-3f1d9af8af84",
   "metadata": {},
   "outputs": [],
   "source": [
    "# print(user_prompt_for(testweb))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "ec73d3ad-3239-4686-84ac-44f0b10bce67",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the messages list (system prompt + user prompt) in the format the OpenAI chat API expects\n",
    "\n",
    "def messages_for(website):\n",
    "    \"\"\"Return the two chat messages (system, then user) used to analyse `website`.\"\"\"\n",
    "    system_message = {\"role\": \"system\", \"content\": system_prompt}\n",
    "    user_message = {\"role\": \"user\", \"content\": user_prompt_for(website)}\n",
    "    return [system_message, user_message]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9888b6be-4876-4eb7-a1c7-6980b7421b66",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Try this out, and then try for a few more websites\n",
    "\n",
    "messages_for(testweb)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "6f1978e7-dcf5-4230-a8c1-b65ba0592c12",
   "metadata": {},
   "outputs": [],
   "source": [
    "# And now: call the OpenAI API. You will get very familiar with this!\n",
    "\n",
    "def analyze_code(url, model=\"gpt-4o-mini\"):\n",
    "    \"\"\"\n",
    "    Load `url` in the browser and ask an OpenAI chat model to review its page source.\n",
    "\n",
    "    Args:\n",
    "        url: address of the page to analyse\n",
    "        model: name of the chat model to call; the default keeps the model this notebook originally used\n",
    "\n",
    "    Returns:\n",
    "        The message content of the first choice in the model's response.\n",
    "    \"\"\"\n",
    "    website = Website(url)\n",
    "    response = openai.chat.completions.create(\n",
    "        model=model,\n",
    "        messages=messages_for(website),\n",
    "    )\n",
    "    return response.choices[0].message.content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2664ab62-3c9d-443b-a2d2-c3bb285500c1",
   "metadata": {},
   "outputs": [],
   "source": [
    "analyze_code(\"https://nohello.net\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "a840a848-d1c9-421c-ad39-e84584714c2c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# A function to display this nicely in the Jupyter output, using markdown\n",
    "\n",
    "def display_results(url):\n",
    "    \"\"\"Run the analysis for `url` and render the model's reply as Markdown in the cell output.\"\"\"\n",
    "    display(Markdown(analyze_code(url)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "81404426-3fa6-415b-a6d0-787aeb165613",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "# Security Analysis of the \"no hello\" Website\n",
       "\n",
       "Here are the potential vulnerabilities and security gaps observed in the page source of the \"no hello\" website, along with recommendations for each:\n",
       "\n",
       "1. **Inline JavaScript and CSS:** \n",
       "   - **Issue:** Inline styles and scripts can lead to security vulnerabilities, like Cross-Site Scripting (XSS).\n",
       "   - **Recommendation:** Move all inline JS and CSS to external files and ensure they are minimized.\n",
       "\n",
       "2. **Lack of Content Security Policy (CSP):** \n",
       "   - **Issue:** No CSP header is defined, increasing the risk of XSS attacks.\n",
       "   - **Recommendation:** Implement a Content Security Policy to restrict sources of scripts and styles.\n",
       "\n",
       "3. **Local Storage Usage:**\n",
       "   - **Issue:** Using `localStorage` for language preference can expose it to XSS if not properly sanitized.\n",
       "   - **Recommendation:** Ensure any data written to or read from `localStorage` is properly sanitized.\n",
       "\n",
       "4. **HTTP Content Security Headers Missing:**\n",
       "   - **Issue:** Missing headers like `X-Content-Type-Options`, `X-Frame-Options`, etc.\n",
       "   - **Recommendation:** Implement additional security headers to mitigate common threats.\n",
       "\n",
       "5. **Image URLs with Unsecured Path:**\n",
       "   - **Issue:** The image sources use double slashes which could result in unintended behavior.\n",
       "   - **Recommendation:** Ensure image URLs are absolute and formatted correctly to avoid resource loading issues.\n",
       "\n",
       "6. **External Script Source:**\n",
       "   - **Issue:** The site imports external scripts (like `typed.js`) from a CDN without integrity checks.\n",
       "   - **Recommendation:** Use the Subresource Integrity (SRI) attribute for external script imports.\n",
       "\n",
       "7. **Exposed Links:**\n",
       "   - **Issue:** External links in the content are not set to open in a new tab.\n",
       "   - **Recommendation:** Use `target=\"_blank\"` on external links to prevent potential tab-nabbing attacks.\n",
       "\n",
       "8. **Deprecated HTML Elements:**\n",
       "   - **Issue:** Use of some old HTML elements may lead to compatibility issues.\n",
       "   - **Recommendation:** Ensure HTML is up to date and complies with current standards.\n",
       "\n",
       "By addressing these vulnerabilities, the website can enhance its overall security posture and better protect user data."
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "display_results(\"https://nohello.net\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fdadf917-86e1-4694-b708-5a8ce9e050df",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
